load required packages

library(tidyverse)
library(ade4) # Coinertia
library(vegan) # Rarefaction
library(ggrepel)
library(metacoder)
## Warning: le package 'metacoder' a été compilé avec la version R 4.3.3
main_theme = theme_minimal()+
  theme(line = element_blank(), 
        axis.line = element_line(colour = "black"),
        panel.border = element_blank(),
        axis.ticks =  element_line(colour = "black"),
        axis.text.x = element_text(colour = "black", size=22, face="italic", angle = 45, vjust = 1, hjust = 1),
        axis.text.y = element_text(colour = "black", size=22, face="italic"),
        legend.title = element_text(colour = "black", size=20,
                                    hjust =0.5),
        
        legend.text = element_text(colour = "black", size=18),
        axis.title= element_text(size=28),
        strip.text = element_text(colour = "black", size=15, face ="italic"))

Data path

metab1 = "data/metabarcoding_vs_metagenomics/oracle_metaBt0_16S_samples_run_20190715_kraken2_assignment_genuses.tsv"
#metab2 = "data/metabarcoding_vs_metagenomics/oracle_metaBt0_16S_samples_run_20200106_kraken2_assignment_genuses.tsv"
metag1 = "data/metabarcoding_vs_metagenomics/oracle_metaGt0_SAMA_12_samples_kraken2_SSU_assignment_genuses.tsv"
#metag2 = "data/metabarcoding_vs_metagenomics/oracle_metaGt0_SAMA_21_1_samples_kraken2_SSU_assignment_genuses.tsv"
#metag3 = "data/metabarcoding_vs_metagenomics/oracle_metaGt0_SAMA_21_2_samples_kraken2_SSU_assignment_genuses.tsv"

Load data

Set up function to load and pepare data

taxonomic_levels <- c("kingdom", "phylum", "class", "order", "family", 
                      "genus", "species")
## ---- Arrange metaB ----

# Select data from bracken method, and rename columns 
select_and_rename_cols = function(tab, method = "G"){
  metag1%>%
    read_tsv(col_names = T, skip = 1)%>%
    select(which(str_detect(colnames(.),"bracken_genuses")), "#OTU ID", "taxonomy")%>%
    rename_with( # rename colnames
      .%>%
      str_remove( "_bracken_genuses")%>% 
      str_replace_all("^OR-", "BU_") %>% 
      str_replace_all("-", "_")%>%
      str_remove("_S\\d+")%>%
      str_replace("((?<!.)T_)(\\d)", "CONT_BU_PCR_\\2")%>%
      str_replace("(BU_T_extr)", "CONT_BU_ext")%>%
      str_replace("#OTU ID", "OTU")%>%
      str_replace("(BU_[A-Z]+_)(\\d{1})(_R\\d+)", "\\10\\2\\3"
                  ))%>% # add 0 before 1-Digit Numbers of soil code
    rename_with(~ paste0(., "_", method), contains("BU"))%>% # add B or G at the end of soil code depending on the method 
      
    separate_wider_delim(taxonomy, names = taxonomic_levels, delim = "; ", cols_remove = F)%>% # Separate taxonomy colums in 6 colums of taxonomic levels
    filter(kingdom == "k__Bacteria")
}

Decontamination function

Compute the total number of reads for each OTU present in control samples. That sum is then substracted from all occurrences of that OTU in true samples. The rationale is as follows:

  • if an OTU is abundant in control samples, but rare in true samples, then it is a contamination specific to the control samples, and it will be eliminated by the substraction (i.e, final abundance is 0),
  • if an OTU is present in control samples, and present in true samples (systematic contamination, will be mitigated by the substraction),
  • if an OTU is rare in control samples, but abundant in true samples (cross-talk, will be eliminated/mitigated by the substraction)

Control samples can be eliminated from the statistical analysis after the substraction (all OTUs present in control samples have been zeroed out).

.%>%
  #select(!ends_with(run_rm))%>%
  replace(. == 0, NA) %>%
  pivot_longer(starts_with("BU"), names_to = "samples", values_to = "reads") %>%
  filter(!is.na(reads)) %>%
  #{{merge control samples}}
  mutate( n = rowSums(across(starts_with("CONT")), na.rm = T))%>%
  #{{substract abundance of control samples}}
  mutate(reads = case_when(
    is.na(n)  ~ reads,
    n > reads ~ 0,
    TRUE      ~ reads - n)) %>%
  select(-n,-starts_with("CONT"))%>%
  pivot_wider(values_from = reads, names_from = samples, values_fill = 0) -> decontaminate

Load and format data for coinertia

metab1%>%
 select_and_rename_cols("B")%>%
 decontaminate -> metab1_decont_table
## Rows: 4826 Columns: 318
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (1): taxonomy
## dbl (317): #OTU ID, BU-RT-01_R1, BU-RT-01_R1_bracken_genuses, BU-RT-01_R2, B...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metag1%>%
 select_and_rename_cols("G")%>%
 decontaminate -> metag1_decont_table
## Rows: 4826 Columns: 318
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (1): taxonomy
## dbl (317): #OTU ID, BU-RT-01_R1, BU-RT-01_R1_bracken_genuses, BU-RT-01_R2, B...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metag1_decont_table%>%
  full_join( metab1_decont_table, by = join_by(OTU == OTU, kingdom==kingdom, phylum == phylum,
                                               class == class, order == order, family == family,
                                               genus == genus,  species == species, taxonomy == taxonomy))%>%
   mutate(across(where(is.numeric), ~ replace(., is.na(.), 0))) -> meta_g_b_decond_table
metab1_decont_table%>%
  select(starts_with("BU"))%>%rowSums
##   [1]   28932    3369      30    8186    6592      57     209   91909   44351
##  [10]    5604    5758     812     765      68     209      19    7587    7951
##  [19]     247      25    1188      50     180      54     130      49   17893
##  [28]      77      90      19     147     161   14638    1452      22     167
##  [37]    2577      90     334     317    1933      34     297      22     326
##  [46]     106     629    4415      61    2383    1275     218     280      80
##  [55]      36   62087   46571    3258    1308      73   51053   64149    1471
##  [64]     102      36      83  489030   57755   37880  407873  794082      10
##  [73]   22892 5428983     198      50   16726    3576    3470    7167      18
##  [82]   13771  314935    1339     846    1017      70   20259   21683     391
##  [91]       9      85      80      19      48    4833     140    3044  607762
## [100]   59109   14636    6176     103     365      44   10767    7671    2602
## [109]     350      56     942     148     155   73164    2611 1929676 1041241
## [118] 2678267    2994      19  804644    1462  116227 1347772    2651 1094662
## [127]    1199    5239      37      26   14507      23   13682    8104     972
## [136]      66   14745      66     461      28     837    2327     125    1940
## [145]   10278   12353    1104      54      16   10053      27      16  743136
## [154]     178     279     169    4574    1012    1023     103      58   40863
## [163]     157      61     930      80     125     129      29     680    7606
## [172]    3588    4357     237     316     511     136    1288      39      13
## [181]     738    4935      19     138      19  159889  107040     494     991
## [190]     969      25     637     236     113      40     101   22404     221
## [199]     248    1744    4269    6702    2594   39880    4619      81    7043
## [208]      15     218      43     256     878     533      60      86    5233
## [217]   23902   10363    8733    3190    6354      16      60     474    7670
## [226]     936     873    3994     688    7124     372     135      29     115
## [235]    5880      53      10    4166     753      10   23445      15     160
## [244]    8233       9  196282      85    3333      32      77   11821    1333
## [253]      18      56      17   75255    1052     161    2441     104      20
## [262]      44      12      90     165      24      21      25      10     201

Is it normal that some OTU have 0 reads ?

Rarefaction function

rarefaction.func = function(df,rar_sample, rarcur = T){
  
  
  if(rarcur){ # {{ if rarefaction curve needed}}
    metab1_decont_table%>%
  select(starts_with("BU"))%>%
  t()%>%rarecurve(step = 20, sample = rar_sample, col = "blue", cex = 0.6)
  }
   # {{rarefaction}}
  df%>%
    select(starts_with("BU"))%>%
    t()%>%
    rrarefy(rar_sample)%>%
    t()%>%
    bind_cols(
      df%>%select(!starts_with("BU"))
    )-> df_rarefy
  return(df_rarefy)
}

Which value to rarefy ?

meta_g_b_decond_table%>%
  select(starts_with("BU"))%>%
  t()%>%rowSums%>%sort%>%log10%>%plot(1:length(.),.)

Gap around 10^3.8 => rarefy at this value

Rarefaction

meta_g_b_decond_table%>%rarefaction.func(rar_sample =round(10^3.8), rarcur = T) -> meta_g_b_decond_table_rare
## Warning in rarecurve(., step = 20, sample = rar_sample, col = "blue", cex =
## 0.6): most observed count data have counts 1, but smallest count is 5
## Warning in rrarefy(., rar_sample): function should be used for observed counts,
## but smallest count is 5

meta_g_b_decond_table_rare$BU_TS_15
## Warning: Unknown or uninitialised column: `BU_TS_15`.
## NULL
meta_g_b_decond_table$BU_TS_15
## Warning: Unknown or uninitialised column: `BU_TS_15`.
## NULL

##Coinertia

Fist PCA for each method

meta_g_b_decond_table_rare%>%
  column_to_rownames("OTU")%>%
  select(ends_with("R1_B"))%>%
  select(order(colnames(.)))%>%
  decostand(method = "hellinger")%>%
  dudi.pca( scale = TRUE, scan = FALSE, nf = 3) -> dudi_b_r1

meta_g_b_decond_table_rare%>%
  column_to_rownames("OTU")%>%
  select(ends_with("R2_B"))%>%
  select(order(colnames(.)))%>%
  decostand(method = "hellinger")%>%
  dudi.pca( scale = TRUE, scan = FALSE, nf = 3) -> dudi_b_r2

meta_g_b_decond_table_rare%>%
  column_to_rownames("OTU")%>%
  select(ends_with("R1_G"))%>%
  select(order(colnames(.)))%>%
  decostand(method = "hellinger")%>%
  dudi.pca( scale = TRUE, scan = FALSE, nf = 3) -> dudi_g_r1

meta_g_b_decond_table_rare%>%
  column_to_rownames("OTU")%>%
  select(ends_with("R2_G"))%>%
  select(order(colnames(.)))%>%
  decostand(method = "hellinger")%>%
  dudi.pca( scale = TRUE, scan = FALSE, nf = 3) -> dudi_g_r2
# Disjoint R1 and R2 and decontaminate

I’m doing the Hellinger transformation on the matrix (OTU x site). Usually the transformation (and the coinertia by extension) is done on the transpose of this matrix (site x species/OUT).It is because most of the time we are interested in the difference of species composition in sites. However here we are interested in the species detection difference, in other word we are looking for difference of species abundances in sites

/! Maybe do separate or merge Hellinger transformation on the 2 df ?

Better graphical representation function

### ---- better graph representation for coinertia
coin.graph = function(coin_obj,meta_obj, brin, method){
  # {{recover coordinate in the ordination space of the 2 method to compaire}}
  cbind(coin_obj$mX,coin_obj$mY) ->  df_coin_pos
  colnames(df_coin_pos)=c("metho1_x", "metho1_y", "metho2_x", "metho2_y")
  
  df_coin_pos%>%
    rownames_to_column("OTU")%>%
    mutate(OTU = as.numeric(OTU))%>%
     # {{recover OTU and genus information}}
    left_join(meta_obj%>%select(OTU, genus), by = "OTU")%>%
    # {{Find taxa with the higher and lower diff (lower and higer distance in the ordination space between metho1(x,y) and metho2(x,y) )}}
    mutate(dist =  sqrt(rowSums((coin1$mX-coin1$mY)^2)))%>%
    arrange(desc(dist)) %>%
    slice(c(1:10, (n() - 9):n()))%>%
    # {{add a colums to facet_wrap}}
    mutate(diff = c(rep("Strong",10 ), rep("Weak", 10)))%>%

    ggplot()+
    facet_wrap(~diff)+
    geom_label_repel(aes(x= metho1_x, y = metho1_y ,label = genus))+
    geom_segment(aes(x = metho1_x, y = metho1_y, xend = metho2_x, yend = metho2_y),
                 arrow = arrow(length = unit(0.3, "cm"), type = "closed"), cex =1)+
    
    labs(x="X",y = "Y", title = paste("Genera estimations differences between", brin[1],method[1], "(arrow tail) and", brin[2],method[2], "(arrow head) across 80 samples"))+
    main_theme
    }

Metabarcoding R1 vs R2

coin1 <- coinertia(dudi_b_r1,dudi_b_r2, scan = FALSE, nf = 2)

summary(coin1)
## Coinertia analysis
## 
## Class: coinertia dudi
## Call: coinertia(dudiX = dudi_b_r1, dudiY = dudi_b_r2, scannf = FALSE, 
##     nf = 2)
## 
## Total inertia: 331.3
## 
## Eigenvalues:
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 304.687   9.059   2.714   1.264   1.013 
## 
## Projected inertia (%):
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 91.9565  2.7340  0.8191  0.3816  0.3058 
## 
## Cumulative projected inertia (%):
##     Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
##   91.96   94.69   95.51   95.89   96.20 
## 
## (Only 5 dimensions (out of 77) are shown)
## 
## Eigenvalues decomposition:
##          eig     covar      sdX      sdY      corr
## 1 304.686922 17.455283 4.762115 4.667198 0.7853636
## 2   9.058705  3.009768 2.063545 1.627278 0.8963082
## 
## Inertia & coinertia X (dudi_b_r1):
##     inertia      max     ratio
## 1  22.67774 22.79292 0.9949467
## 12 26.93596 27.08917 0.9943440
## 
## Inertia & coinertia Y (dudi_b_r2):
##     inertia      max     ratio
## 1  21.78274 21.97014 0.9914702
## 12 24.43078 24.68268 0.9897941
## 
## RV:
##  0.5854531
plot(coin1)

coin.graph(coin1,meta_obj= meta_g_b_decond_table_rare, brin = c("R1","R2"), method = c("metaB", "metaB"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Metagenomic R1 vs R2

coin2 <- coinertia(dudi_g_r1,dudi_g_r2, scan = FALSE, nf = 2)

summary(coin2)
## Coinertia analysis
## 
## Class: coinertia dudi
## Call: coinertia(dudiX = dudi_g_r1, dudiY = dudi_g_r2, scannf = FALSE, 
##     nf = 2)
## 
## Total inertia: 354.7
## 
## Eigenvalues:
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 327.938   9.690   2.644   1.307   1.134 
## 
## Projected inertia (%):
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 92.4675  2.7324  0.7456  0.3685  0.3198 
## 
## Cumulative projected inertia (%):
##     Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
##   92.47   95.20   95.95   96.31   96.63 
## 
## (Only 5 dimensions (out of 77) are shown)
## 
## Eigenvalues decomposition:
##          eig     covar      sdX      sdY      corr
## 1 327.938234 18.109065 4.778757 4.742291 0.7990847
## 2   9.690353  3.112933 2.057904 1.700255 0.8896736
## 
## Inertia & coinertia X (dudi_g_r1):
##     inertia      max     ratio
## 1  22.83652 22.93899 0.9955330
## 12 27.07149 27.23622 0.9939518
## 
## Inertia & coinertia Y (dudi_g_r2):
##     inertia      max     ratio
## 1  22.48933 22.70778 0.9903798
## 12 25.38020 25.66096 0.9890585
## 
## RV:
##  0.6054525
plot(coin2)

coin.graph(coin2,meta_obj= meta_g_b_decond_table_rare, brin = c("R1","R2"), method = c("metaG", "metaG"))
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Metabarcoding R1 vs Metagenomic R1

coin3 <- coinertia(dudi_b_r1,dudi_g_r1, scan = FALSE, nf = 2)

summary(coin3)
## Coinertia analysis
## 
## Class: coinertia dudi
## Call: coinertia(dudiX = dudi_b_r1, dudiY = dudi_g_r1, scannf = FALSE, 
##     nf = 2)
## 
## Total inertia: 590.6
## 
## Eigenvalues:
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 522.582  18.326   5.195   3.310   2.431 
## 
## Projected inertia (%):
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 88.4900  3.1032  0.8796  0.5606  0.4116 
## 
## Cumulative projected inertia (%):
##     Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
##   88.49   91.59   92.47   93.03   93.44 
## 
## (Only 5 dimensions (out of 79) are shown)
## 
## Eigenvalues decomposition:
##         eig     covar      sdX      sdY      corr
## 1 522.58164 22.860045 4.774173 4.789446 0.9997551
## 2  18.32606  4.280895 2.072342 2.072574 0.9966972
## 
## Inertia & coinertia X (dudi_b_r1):
##     inertia      max     ratio
## 1  22.79273 22.79292 0.9999917
## 12 27.08733 27.08917 0.9999318
## 
## Inertia & coinertia Y (dudi_g_r1):
##     inertia      max     ratio
## 1  22.93879 22.93899 0.9999914
## 12 27.23436 27.23622 0.9999315
## 
## RV:
##  0.9967147
plot(coin3)

coin.graph(coin3,meta_obj= meta_g_b_decond_table_rare, brin = c("R1","R1"), method = c("metaB", "metaG"))
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Metabarcoding R2 vs Metagenomic R2

coin4 <- coinertia(dudi_b_r2,dudi_g_r2, scan = FALSE, nf = 2)

summary(coin4)
## Coinertia analysis
## 
## Class: coinertia dudi
## Call: coinertia(dudiX = dudi_b_r2, dudiY = dudi_g_r2, scannf = FALSE, 
##     nf = 2)
## 
## Total inertia: 518.2
## 
## Eigenvalues:
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 489.144   6.432   2.634   1.519   1.189 
## 
## Projected inertia (%):
##     Ax1     Ax2     Ax3     Ax4     Ax5 
## 94.4003  1.2414  0.5084  0.2932  0.2295 
## 
## Cumulative projected inertia (%):
##     Ax1   Ax1:2   Ax1:3   Ax1:4   Ax1:5 
##   94.40   95.64   96.15   96.44   96.67 
## 
## (Only 5 dimensions (out of 77) are shown)
## 
## Eigenvalues decomposition:
##          eig     covar      sdX      sdY      corr
## 1 489.144172 22.116604 4.686695 4.764793 0.9903933
## 2   6.432355  2.536209 1.620178 1.699920 0.9208603
## 
## Inertia & coinertia X (dudi_b_r2):
##     inertia      max     ratio
## 1  21.96511 21.97014 0.9997710
## 12 24.59009 24.68268 0.9962486
## 
## Inertia & coinertia Y (dudi_g_r2):
##     inertia      max     ratio
## 1  22.70325 22.70778 0.9998006
## 12 25.59298 25.66096 0.9973507
## 
## RV:
##  0.9260847
plot(coin4)

coin.graph(coin4,meta_obj= meta_g_b_decond_table_rare, brin = c("R2","R2"), method = c("metaB", "metaG"))
## Warning: ggrepel: 8 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Metacoder

Function that make the metacoder object that contain the taxonomic comparison tree

make.tax.tree.comp = function(df,method){
  meta_g_b_decond_table_rare %>%
  select(OTU, kingdom, phylum, class, order, family, genus, species, taxonomy, ends_with(method[1]), ends_with(method[2]))%>%
  # {{standardization of the table produce by the "method 1" (first method put in the vector)}}
  mutate(across(ends_with(method[1]), ~ sqrt(. / rowSums(across(ends_with(method[1])))), .names = "hellinger_{col}"), 
  # {{standardization of the table produce by the "method 2" (second method put in the vector)}}       
  across(ends_with(method[2]), ~ sqrt(. / rowSums(across(ends_with(method[2])))), .names = "hellinger_{col}"),.keep = "unused")%>%
  # {{remove species that don't appear in the 2 method that we compare in this chunk}}
  mutate(total = rowSums(across(starts_with("hellinger_") )))%>%
  filter(total !=0 )%>%
  # {{create metacoder object}}
  metacoder::parse_tax_data(class_cols = "taxonomy", # The column in the input table
                      class_sep = "; ",
                      class_regex = "^([a-z]{0,1})_{0,2}(.*)$",
                      class_key = c("tax_rank" = "taxon_rank", "name" = "taxon_name")) -> obj
  
  # {{compute abudance (standadize by hellinger) per taxon for the 2 methode}}
  obj$data$tax_abund <- metacoder::calc_taxon_abund(obj, "tax_data",
                                       cols = startsWith(colnames(obj$data$tax_data),"hellinger"))
  
  obj$data$tax_abund_group <- metacoder::calc_taxon_abund(obj, "tax_data",
                                       cols = startsWith(colnames(obj$data$tax_data),"hellinger"),
                                                         groups = str_extract(str_subset(colnames(obj$data$tax_data), "_R\\d_[A-Z+]"), "R\\d_[A-Z+]"))
  
  obj$data$diff_table = metacoder::compare_groups(obj, "tax_abund",
                                       cols = (startsWith(colnames(obj$data$tax_abund),"hellinger")),
                                                         groups = str_extract(str_subset(colnames(obj$data$tax_abund), "_R\\d_[A-Z+]"), "R\\d_[A-Z+]"))
  obj = mutate_obs(obj, "diff_table",
                  wilcox_p_value = p.adjust(wilcox_p_value, method = "fdr"))
  # {{Ns. set to 0}}
  obj$data$diff_table%>%
    mutate(median_diff = case_when(wilcox_p_value > 0.05 ~0,
                      .default  = median_diff),
           log2_median_ratio = case_when(wilcox_p_value > 0.05 ~0,
                      .default  = median_diff))
  return(obj)
}

Metabarcoding R1 vs R2

meta_g_b_decond_table_rare%>%
  make.tax.tree.comp(method = c("R1_B","R2_B")) -> obj1
## Summing per-taxon counts from 158 columns for 627 taxa
## Summing per-taxon counts from 158 columns in 2 groups for 627 taxa
obj1%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = gsub(pattern = "\\[|\\]", replacement = "", taxon_names),
            node_size = R1_B+R2_B,
            node_color = R1_B-R2_B,
            node_color_range = c("blue", "gray", "red"),
            node_color_interval = c(-max(abs(R1_B-R2_B)), max(abs(R1_B-R2_B))),
            node_color_axis_label = "Standardized abudance \n difference across site (R1_B-R2_B)",
            node_size_axis_label = "OTU abudance sum per taxon\n (standard hellinger)",
            layout = "davidson-harel", initial_layout = "reingold-tilford")

Here we see that in the phylum Protobacteia the class Gammaprotobacteria is mostly detected by R1 and Alphaprotobacteria is mostly detected by R1. Fimicutes and Actinobacteria are better detected by R1

Better version with statistical significance

obj1%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = log2_median_ratio, # difference between groups
          node_color_interval = c(-10, 10), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

obj1 %>%
  mutate_obs("cleaned_names", gsub(taxon_names, pattern = "\\[|\\]", replacement = "")) %>%
  metacoder::filter_taxa(grepl(cleaned_names, pattern = "^[a-zA-Z]+$")) %>%
  heat_tree(node_label = cleaned_names,
            node_size = n_obs, # number of OTUs
            node_color = log2_median_ratio, # difference between groups
            node_color_interval = c(-10, 10), # symmetric interval
            node_color_range = c("cyan", "gray", "magenta"), # diverging colors
            node_size_axis_label = "OTU count",
            node_color_axis_label = "Log 2 ratio of median counts",
            layout = "da", initial_layout = "re", # good layout for large trees
            title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))
## Adding a new "character" vector of length 627.

obj1%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = median_diff, # difference between groups
          node_color_interval = c(-max(abs(median_diff)), max(abs(median_diff))), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

Metagenomic R1 vs R2

meta_g_b_decond_table_rare%>%
  make.tax.tree.comp(method = c("R1_G","R2_G")) -> obj2
## Summing per-taxon counts from 158 columns for 599 taxa
## Summing per-taxon counts from 158 columns in 2 groups for 599 taxa
obj2%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = gsub(pattern = "\\[|\\]", replacement = "", taxon_names),
            node_size = R1_G+R2_G,
            node_color = R1_G-R2_G,
            node_color_range = c("blue", "gray", "red"),
            node_color_interval = c(-max(abs(R1_G-R2_G)), max(abs(R1_G-R2_G))),
            node_color_axis_label = "Standardized abudance \n difference across site (R1_G-R2_G)",
            node_size_axis_label = "OTU abudance sum per taxon\n (standard hellinger)",
            layout = "davidson-harel", initial_layout = "reingold-tilford")

Different pattern for metaG than metaB here in phylum Protobacteia the class Gammaprotobacteria and Alphaprotobacteria are both more detected by R1. Fimicutes and Actinobacteria are also better detected by R1

Better version with statistical significance

obj2%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = log2_median_ratio, # difference between groups
          node_color_interval = c(-10, 10), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

obj2 %>%
  mutate_obs("cleaned_names", gsub(taxon_names, pattern = "\\[|\\]", replacement = "")) %>%
  metacoder::filter_taxa(grepl(cleaned_names, pattern = "^[a-zA-Z]+$")) %>%
  heat_tree(node_label = cleaned_names,
            node_size = n_obs, # number of OTUs
            node_color = log2_median_ratio, # difference between groups
            node_color_interval = c(-10, 10), # symmetric interval
            node_color_range = c("cyan", "gray", "magenta"), # diverging colors
            node_size_axis_label = "OTU count",
            node_color_axis_label = "Log 2 ratio of median counts",
            layout = "da", initial_layout = "re", # good layout for large trees
            title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))
## Adding a new "character" vector of length 599.

obj2%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = median_diff, # difference between groups
          node_color_interval = c(-max(abs(median_diff)), max(abs(median_diff))), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

### Metabarcoding R1 vs Metagenomic R1

meta_g_b_decond_table_rare%>%
  make.tax.tree.comp(method = c("R1_B","R1_G")) -> obj3
## Summing per-taxon counts from 158 columns for 745 taxa
## Summing per-taxon counts from 158 columns in 2 groups for 745 taxa
obj3%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = gsub(pattern = "\\[|\\]", replacement = "", taxon_names),
            node_size = R1_B+R1_G,
            node_color = R1_B-R1_G,
            node_color_range = c("blue", "gray", "red"),
            node_color_interval = c(-max(abs(R1_B - R1_G)), max(abs(R1_B - R1_G))),
            node_color_axis_label = "Standardized abudance \n difference across site (R1_B-R1_G)",
            node_size_axis_label = "OTU abudance sum per taxon\n (standard hellinger)",
            layout = "davidson-harel", initial_layout = "reingold-tilford")

Better version with statistical significance

obj3%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = log2_median_ratio, # difference between groups
          node_color_interval = c(-10,10), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

obj3 %>%
  mutate_obs("cleaned_names", gsub(taxon_names, pattern = "\\[|\\]", replacement = "")) %>%
  metacoder::filter_taxa(grepl(cleaned_names, pattern = "^[a-zA-Z]+$")) %>%
  heat_tree(node_label = cleaned_names,
            node_size = n_obs, # number of OTUs
            node_color = log2_median_ratio, # difference between groups
            node_color_interval = c(-10, 10), # symmetric interval
            node_color_range = c("cyan", "gray", "magenta"), # diverging colors
            node_size_axis_label = "OTU count",
            node_color_axis_label = "Log 2 ratio of median counts",
            layout = "da", initial_layout = "re", # good layout for large trees
            title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))
## Adding a new "character" vector of length 745.

obj3%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = median_diff, # difference between groups
          node_color_interval = c(-max(abs(median_diff)), max(abs(median_diff))), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

Metabarcoding R2 vs Metagenomic R2

meta_g_b_decond_table_rare%>%
  make.tax.tree.comp(method = c("R2_B","R2_G")) -> obj4
## Summing per-taxon counts from 158 columns for 651 taxa
## Summing per-taxon counts from 158 columns in 2 groups for 651 taxa
obj4%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = gsub(pattern = "\\[|\\]", replacement = "", taxon_names),
            node_size = R2_B+R2_G,
            node_color = R2_B-R2_G,
            node_color_range = c("blue", "gray", "red"),
            node_color_interval = c(-max(abs(R2_B-R2_G)), max(abs(R2_B-R2_G))),
            node_color_axis_label = "Standardized abudance \n difference across site (R2_B-R2_G)",
            node_size_axis_label = "OTU abudance sum per taxon\n (standard hellinger)",
            layout = "davidson-harel", initial_layout = "reingold-tilford")

#### Better version with statistical significance

obj4%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = log2_median_ratio, # difference between groups
          node_color_interval = c(-10, 10), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

obj4 %>%
  mutate_obs("cleaned_names", gsub(taxon_names, pattern = "\\[|\\]", replacement = "")) %>%
  metacoder::filter_taxa(grepl(cleaned_names, pattern = "^[a-zA-Z]+$")) %>%
  heat_tree(node_label = cleaned_names,
            node_size = n_obs, # number of OTUs
            node_color = log2_median_ratio, # difference between groups
            node_color_interval = c(-10, 10), # symmetric interval
            node_color_range = c("cyan", "gray", "magenta"), # diverging colors
            node_size_axis_label = "OTU count",
            node_color_axis_label = "Log 2 ratio of median counts",
            layout = "da", initial_layout = "re", # good layout for large trees
            title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))
## Adding a new "character" vector of length 651.

obj4%>%
  filter_taxa(taxon_ranks == "o", supertaxa = TRUE)%>%
  heat_tree(node_label = taxon_names,
          node_size = n_obs, # number of OTUs
          node_color = median_diff, # difference between groups
          node_color_interval = c(-max(abs(median_diff)), max(abs(median_diff))), # symmetric interval
          node_color_range = c("cyan", "gray", "magenta"), # diverging colors
          node_size_axis_label = "OTU count",
          node_color_axis_label = "Log 2 ratio of median counts",
          layout = "davidson-harel", initial_layout = "reingold-tilford",
          title = paste(unique(treatment_1) ,"vs", unique(treatment_2)))

Ralative abudances

meta_g_b_decond_table_rare%>%
  pivot_longer(starts_with("BU"), names_to = "ID", values_to = "abundance")%>%
  mutate(method = str_extract(ID, "R\\d+_[BG]"),
         soil_code = str_extract(ID, "^BU_[A-Z]+_\\d+"))%>%
  group_by(ID)%>%
  mutate(relative_abudance = abundance/sum(abundance))%>%
  ggplot()+
  facet_wrap(~soil_code)+
  geom_col(aes(method,relative_abudance  ,fill = order))+
  main_theme+
  theme(legend.position="none",
        axis.text.x =element_text(angle = 90,vjust = 1, hjust = 1, size = 6, face="italic"))

meta_g_b_decond_table_rare%>%
  pivot_longer(starts_with("BU"), names_to = "ID", values_to = "abundance")%>%
  mutate(method = str_extract(ID, "R\\d+_[BG]"),
         soil_code = str_extract(ID, "^BU_[A-Z]+_\\d+"))%>%
  group_by(ID)%>%
  mutate(relative_abudance = abundance/sum(abundance))%>%
  ungroup()%>%
  group_by(order, method)%>%
  summarise(relative_abudance_mean = mean(relative_abudance))%>%
  ggplot()+
  #facet_wrap(~order)+
  geom_raster(aes(method,order, fill = relative_abudance_mean))+
  main_theme+
  theme(legend.position="none",
        axis.text.x =element_text(angle = 90,vjust = 1, hjust = 1, size = 6, face="italic"))
## `summarise()` has grouped output by 'order'. You can override using the
## `.groups` argument.